Bank Marketing

Goal for the project is to Predict whether a client will subscribe (yes / no) to a term deposit, this based on data from a Portuguese bank’s direct marketing campaigns conducted via phone calls.

The dataset is structured and suitable for models like Logistic Regression, LDA, QDA, KNN, Random Forest, and others.

Objectives

  • Build and interpret a logistic regression model without interaction terms.
  • Use EDA and domain intuition (not algorithmic feature selection) to guide variable inclusion.
  • Interpret regression coefficients and confidence intervals, focusing on how key predictor variables influence the likelihood of subscription.
  • Distinguish between statistical significance (p-values, CIs) and practical significance (effect magnitude and meaning).
  • Use AIC as the primary model comparison tool during training, ensuring the model remains interpretable and grounded in insights.

Exploratory Data Analysis

data <- read.csv("./bank-full.csv", header = TRUE, sep = ";", stringsAsFactors = TRUE)

bank <- data |> rename(subscribed = y)
num_rows <- nrow(bank)
num_cols <- ncol(bank) - 1

data_summary <- data.frame(
  Characteristic = c("Number of Rows", "Number of Columns", "Number of Predictors", "Target Variable"),
  Value = c(num_rows, num_cols, num_cols, "subscribed")
)

data_summary

Summary Statistics

summary(bank)
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:9732   divorced: 5207   primary  : 6851  
##  1st Qu.:33.00   management :9458   married :27214   secondary:23202  
##  Median :39.00   technician :7597   single  :12790   tertiary :13301  
##  Mean   :40.94   admin.     :5171                    unknown  : 1857  
##  3rd Qu.:48.00   services   :4154                                     
##  Max.   :95.00   retired    :2264                                     
##                  (Other)    :6835                                     
##  default        balance       housing      loan            contact     
##  no :44396   Min.   : -8019   no :20081   no :37967   cellular :29285  
##  yes:  815   1st Qu.:    72   yes:25130   yes: 7244   telephone: 2906  
##              Median :   448                           unknown  :13020  
##              Mean   :  1362                                            
##              3rd Qu.:  1428                                            
##              Max.   :102127                                            
##                                                                        
##       day            month          duration         campaign     
##  Min.   : 1.00   may    :13766   Min.   :   0.0   Min.   : 1.000  
##  1st Qu.: 8.00   jul    : 6895   1st Qu.: 103.0   1st Qu.: 1.000  
##  Median :16.00   aug    : 6247   Median : 180.0   Median : 2.000  
##  Mean   :15.81   jun    : 5341   Mean   : 258.2   Mean   : 2.764  
##  3rd Qu.:21.00   nov    : 3970   3rd Qu.: 319.0   3rd Qu.: 3.000  
##  Max.   :31.00   apr    : 2932   Max.   :4918.0   Max.   :63.000  
##                  (Other): 6060                                    
##      pdays          previous           poutcome     subscribed 
##  Min.   : -1.0   Min.   :  0.0000   failure: 4901   no :39922  
##  1st Qu.: -1.0   1st Qu.:  0.0000   other  : 1840   yes: 5289  
##  Median : -1.0   Median :  0.0000   success: 1511              
##  Mean   : 40.2   Mean   :  0.5803   unknown:36959              
##  3rd Qu.: -1.0   3rd Qu.:  0.0000                              
##  Max.   :871.0   Max.   :275.0000                              
## 
str(bank)
## 'data.frame':    45211 obs. of  17 variables:
##  $ age       : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ job       : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
##  $ marital   : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
##  $ education : Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
##  $ default   : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
##  $ balance   : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing   : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
##  $ loan      : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
##  $ contact   : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ day       : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ month     : Factor w/ 12 levels "apr","aug","dec",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ duration  : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays     : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome  : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ subscribed: Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

Missing analysis

This data-set contains no empty values at first sight.

colSums(is.na(bank))
##        age        job    marital  education    default    balance    housing 
##          0          0          0          0          0          0          0 
##       loan    contact        day      month   duration   campaign      pdays 
##          0          0          0          0          0          0          0 
##   previous   poutcome subscribed 
##          0          0          0

Separating Numerical vs Categorical

numeric_vars <- names(bank)[sapply(bank, is.numeric)]
categorical_vars <- names(bank)[sapply(bank, function(x) is.factor(x) || is.character(x))]

cat("Numeric variables:\n")
## Numeric variables:
print(numeric_vars)
## [1] "age"      "balance"  "day"      "duration" "campaign" "pdays"    "previous"
cat("Categorical variables:\n")
## Categorical variables:
print(categorical_vars)
##  [1] "job"        "marital"    "education"  "default"    "housing"   
##  [6] "loan"       "contact"    "month"      "poutcome"   "subscribed"

Plot of missing data

Visual confirmation for emptyness search, no data is missing in this data set

vis_miss(bank) +
  labs(
    title = "Visualizing Missing Data",
    x = "",
    y = ""
  ) +
  theme(
    plot.title = element_text(size = 8, face = "bold"),
    plot.subtitle = element_text(size = 8),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )

Find some relationships

plt_subscribed <- bank |>
  group_by(subscribed) |>
  summarise(cnt = n()) |>
  mutate(perc = round(cnt / sum(cnt), 4))

plt_prop <- ggplot(plt_subscribed, aes(x = subscribed, y = perc, colour = subscribed)) +
  geom_bar(aes(fill = subscribed), show.legend = FALSE, stat = "identity") +
  ylab("Proportion of Subscribed")

grid.arrange(grobs = list(tableGrob(plt_subscribed), plt_prop), ncol = 1)

categorical_vars_plt <- categorical_vars[categorical_vars != "subscribed"]

plt_categorical <- lapply(seq_along(categorical_vars_plt), function(i) {
  ggplot(bank, aes_string(x = categorical_vars_plt[i], fill = "subscribed")) +
    geom_bar(position = "fill") +
    scale_y_continuous(labels = scales::percent) +
    scale_fill_discrete() +
    labs(title = paste("Subscription Rate by", categorical_vars_plt[i]),
         y = "Proportion", x = NULL) +
    coord_flip() +
    theme(legend.position = if (i == 1) "bottom" else "none")
})

wrap_plots(plt_categorical, ncol = 3, guides = "collect") &  theme(legend.position = "bottom")

plt_num <- lapply(numeric_vars, function(var) {
  ggplot(bank, aes_string(x = "subscribed", y = var, fill = "subscribed")) +
    geom_boxplot(alpha = 0.7) +
    scale_fill_discrete() +
    labs(title = paste("Dist. of", var, "by Subscribed"), y = var, x = NULL)
})

wrap_plots(plt_num, ncol = 3) & theme(legend.position = "none")

Outlier assesment

for_outliers <- bank
for_outliers$subscribed <- ifelse(bank$subscribed == "yes", 1, 0)
model <- lm(subscribed ~ previous, data = for_outliers)
influencePlot(model, id.method = "identify", 
              main = "Influence Plot: Cook's D vs Leverage",
              sub = "Size of circle = Cook's distance")

Plot the most influential

cooks_d <- cooks.distance(model)
N <- 5
top_influential_df <- data.frame(
  index = 1:length(cooks_d),
  cooks_distance = cooks_d,
  previous = bank$previous,
  subscribed = bank$subscribed
) |>
  arrange(desc(cooks_distance)) |>
  slice(1:N)

ggplot(top_influential_df, aes(x = reorder(as.factor(index), -cooks_distance), y = cooks_distance)) +
  geom_col(fill = "steelblue") +
  geom_text(aes(label = round(cooks_distance, 4)), vjust = -0.5, size = 3.5) +
  labs(
    title = paste("Top", N, "Influential Points (Cook's Distance)"),
    x = "Observation Index",
    y = "Cook's Distance"
  )

Observation 29183 has extremely high leverage and Cook’s Distance (~44.3), making it a highly influential point in the model.

Remove Observation

bank <- bank[-29183, ]

Correlation

for_corr <- bank
for_corr$subscribed <- ifelse(bank$subscribed == "yes", 1, 0)

vars_corr <- names(for_corr)[sapply(for_corr, is.numeric)]
corr_df <- for_corr[vars_corr]

cor_matrix <- cor(corr_df, use = "complete.obs")
subscribed_cor <- cor_matrix[, "subscribed", drop = FALSE]
subscribed_cor <- subscribed_cor[order(abs(subscribed_cor[,1]), decreasing = TRUE), , drop = FALSE]

cor_df <- data.frame(
  variable = rownames(subscribed_cor),
  correlation = subscribed_cor[,1]
)

ggplot(cor_df, aes(x = reorder(variable, correlation), y = correlation)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Correlation with Subscribed", x = "Variable", y = "Correlation")

PCA

for_pca <- bank
for_pca <- bank[sapply(data, is.numeric)]

pca <- prcomp(for_pca, scale. = TRUE)
autoplot(pca, data = bank, colour = 'subscribed', loadings = TRUE, loadings.label = TRUE) +
  labs(title = "PCA")

loadings <- as.data.frame(pca$rotation)
loadings$variable <- rownames(loadings)

loadings$PC1.ABS <- abs(loadings$PC1)
loadings$PC2.ABS <- abs(loadings$PC2)

top_pc1 <- loadings[order(-loadings$PC1.ABS), c("variable", "PC1")][1:7, ]
top_pc2 <- loadings[order(-loadings$PC2.ABS), c("variable", "PC2")][1:7, ]

top_combined <- data.frame(
  PC1_Variable = top_pc1$variable,
  PC1_Loading = round(top_pc1$PC1, 3),
  PC2_Variable = top_pc2$variable,
  PC2_Loading = round(top_pc2$PC2, 3)
)

top_combined

Principal Component Analysis (PCA) on the numeric variables revealed that pdays and previous contributed most to the first principal component (PC1), capturing variability related to past campaign exposure. The second component (PC2) was primarily influenced by campaign, day, and negatively by duration, reflecting variation in campaign intensity and contact timing.

We can interpret the components as:

  • PC1: Differences in past contact history, primarily driven by pdays and previous.
  • PC2: Differences in campaign intensity and call timing, shaped by campaign, day, and negatively by duration.
eigenvals <- pca$sdev^2
plot(eigenvals / sum(eigenvals), type = "l", main = "Scree Plot", ylab = "Prop. Var. Explained", xlab = "PC #", ylim = c(0, 1))
cumulative.prop <- cumsum(eigenvals / sum(eigenvals))
lines(cumulative.prop, lty = 2)

eigenvals <- pca$sdev^2
prop_var <- eigenvals / sum(eigenvals)
cum_var <- cumsum(prop_var)
pc_table <- data.frame(
  PC = paste0("PC", 1:length(prop_var)),
  "Variance Explained" = round(prop_var, 4),
  "Cumulative Variance" = round(cum_var, 4)
)

pc_table
pca_scores <- as.data.frame(pca$x)
pca_scores$subscribed <- bank$subscribed

plot_ly(
  data = pca_scores,
  x = ~PC1, y = ~PC2, z = ~PC3,
  color = ~subscribed,
  colors = c("red", "deepskyblue"),
  type = "scatter3d",
  mode = "markers"
)

It appears that we’re missing a significant portion of the variance by focusing only on the numeric variables. PC1 and PC2 explain the most variation among these, but even after adding PC3, we don’t observe meaningful separation between subscription outcomes. This suggests that additional structure — possibly critical for understanding or predicting subscribed — may lie in the categorical variables, which were not included in this PCA.

Logistic Regression

Run GLM in each predictor

Let’s understand a bit how the numerals contribute to explain the subscription

for_glm <- bank
for_glm$subscribed <- ifelse(for_glm$subscribed == "yes", 1, 0)

all_num_additive <- glm(subscribed ~ duration + pdays + previous + campaign + balance + age + day, data = for_glm, family = binomial)
summary(all_num_additive)
## 
## Call:
## glm(formula = subscribed ~ duration + pdays + previous + campaign + 
##     balance + age + day, family = binomial, data = for_glm)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.472e+00  7.700e-02 -45.085  < 2e-16 ***
## duration     3.643e-03  5.648e-05  64.500  < 2e-16 ***
## pdays        1.964e-03  1.552e-04  12.654  < 2e-16 ***
## previous     1.006e-01  7.340e-03  13.709  < 2e-16 ***
## campaign    -1.292e-01  9.609e-03 -13.445  < 2e-16 ***
## balance      3.704e-05  4.294e-06   8.625  < 2e-16 ***
## age          7.910e-03  1.473e-03   5.370 7.88e-08 ***
## day         -1.661e-03  2.014e-03  -0.825    0.409    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 32631  on 45209  degrees of freedom
## Residual deviance: 26464  on 45202  degrees of freedom
## AIC: 26480
## 
## Number of Fisher Scoring iterations: 6

Plot Logistic on Numericals

tidy(all_num_additive, conf.int = TRUE) |>
  filter(term != "(Intercept)") |>
  ggplot(aes(x = reorder(term, estimate), y = estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.1) +
  coord_flip() +
  labs(title = "Logistic Regression Coefficients",
       y = "Estimate (log-odds)", x = "Variable")

Look for signals in categorical

Modeling with all categoricals might tell some story

bank_cat <- bank
cat_model_vars <- setdiff(categorical_vars, "subscribed")
model_glm_cat <- as.formula(paste("subscribed ~", paste(cat_model_vars, collapse = " + ")))

glm_cats <- glm(model_glm_cat, data = bank_cat, family = binomial)
summary(glm_cats)
## 
## Call:
## glm(formula = model_glm_cat, family = binomial, data = bank_cat)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -1.24333    0.10655 -11.669  < 2e-16 ***
## jobblue-collar     -0.12464    0.06490  -1.921 0.054785 .  
## jobentrepreneur    -0.19100    0.11110  -1.719 0.085581 .  
## jobhousemaid       -0.28253    0.11968  -2.361 0.018238 *  
## jobmanagement      -0.04697    0.06561  -0.716 0.474084    
## jobretired          0.46523    0.07788   5.974 2.32e-09 ***
## jobself-employed   -0.09072    0.09911  -0.915 0.360040    
## jobservices        -0.08617    0.07451  -1.156 0.247478    
## jobstudent          0.33069    0.09770   3.385 0.000713 ***
## jobtechnician      -0.06466    0.06187  -1.045 0.295967    
## jobunemployed       0.13108    0.09768   1.342 0.179626    
## jobunknown         -0.19401    0.20780  -0.934 0.350479    
## maritalmarried     -0.20629    0.05158  -4.000 6.35e-05 ***
## maritalsingle       0.08241    0.05554   1.484 0.137862    
## educationsecondary  0.15081    0.05652   2.668 0.007627 ** 
## educationtertiary   0.31779    0.06570   4.837 1.32e-06 ***
## educationunknown    0.20162    0.09242   2.182 0.029137 *  
## defaultyes         -0.15693    0.14688  -1.068 0.285318    
## housingyes         -0.54527    0.03810 -14.313  < 2e-16 ***
## loanyes            -0.40804    0.05303  -7.694 1.43e-14 ***
## contacttelephone   -0.28127    0.06399  -4.395 1.11e-05 ***
## contactunknown     -1.34752    0.06337 -21.263  < 2e-16 ***
## monthaug           -0.97784    0.06846 -14.284  < 2e-16 ***
## monthdec            0.57038    0.16198   3.521 0.000429 ***
## monthfeb           -0.44644    0.07501  -5.952 2.65e-09 ***
## monthjan           -1.08345    0.10618 -10.203  < 2e-16 ***
## monthjul           -0.79694    0.06766 -11.779  < 2e-16 ***
## monthjun            0.10830    0.08091   1.338 0.180741    
## monthmar            1.06567    0.11027   9.664  < 2e-16 ***
## monthmay           -0.50694    0.06341  -7.995 1.30e-15 ***
## monthnov           -0.83485    0.07443 -11.216  < 2e-16 ***
## monthoct            0.68172    0.09776   6.973 3.10e-12 ***
## monthsep            0.65425    0.10739   6.092 1.11e-09 ***
## poutcomeother       0.25429    0.07960   3.194 0.001401 ** 
## poutcomesuccess     2.26565    0.07345  30.848  < 2e-16 ***
## poutcomeunknown     0.03496    0.05155   0.678 0.497693    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 32631  on 45209  degrees of freedom
## Residual deviance: 27296  on 45174  degrees of freedom
## AIC: 27368
## 
## Number of Fisher Scoring iterations: 6
Reference levels
X_cat <- model.matrix(model_glm_cat, data = bank_cat)

reference_lvls <- data.frame(
  Variable = cat_model_vars,
  Reference = sapply(bank_cat[cat_model_vars], function(x) levels(x)[1])
) |> tibble::as_tibble()


reference_lvls
Odds Ratio
odds_ratios <- data.frame(
  Variable = names(coef(glm_cats)),
  Odds_Ratio = exp(coef(glm_cats))
) |> dplyr::mutate(Effect = paste0(round((Odds_Ratio - 1) * 100, 1), "%")) |>
  dplyr::mutate(Odds_Ratio = round(Odds_Ratio, 3)) |>
  dplyr::arrange(desc(Odds_Ratio)) |>
  tibble::as_tibble()

odds_ratios

We can see from the categorical only variables that:

  • Month of contact has a strong influence: campaigns conducted in March, October, and September significantly increased subscription odds, while those in January and August showed markedly lower performance.
  • Job type and education level also shape likelihood: customers who are retired or students, and those with tertiary education, are notably more likely to subscribe.
  • Additionally, having an existing loan appears to negatively impact subscription odds, suggesting financial burden may reduce campaign receptiveness.
Variable Visual Pattern? Clear % Difference? Keep?
contact yes yes (cellular > unknown) yes
loan yes yes (loan = less likely) yes
housing yes yes (housing = likely) yes
default maybe some maybe
education yes yes (tertiary increases) yes
poutcome strong yes (success = very high) yes
marital maybe some separation maybe
job mixed a few clear signals maybe (group rare levels)

Carefully look at poutcome as we do not know what drives from previous mkt approach, and if the customer is showing an affinity to long term deposit, maybe is increasing the current deposit. Default sounds like a good story, I tried swapping ref with not much difference.

Feature Selection (By EDA)

We are having very conflicting results based on the multiple explorations on numerical, that is telling us, that we need categorical variables to play a role in the explainability.

We think that cyclical encoding for month as we see some patterns on specific months could help the model to explain better as seasonality seems to have some effect.

Variable Type Reason for Inclusion
duration Numerical Strongest univariate predictor; higher durations consistently increase subscription odds
pdays Numerical Captures time since last contact; reflects engagement recency
previous Numerical Reflects past campaign success; useful but may be redundant with pdays/campaign
balance Numerical Indicates client financial status; moderate predictive signal
campaign Numerical Current campaign intensity; negative association suggests fatigue with repeated contact
month_sin Numerical Cyclical encoding of month (seasonality); preserves circular month structure
month_cos Numerical Complement to month_sin; together capture monthly cyclic patterns
contact Categorical Clear visual and statistical difference; contact method affects likelihood to subscribe
loan Categorical Customers with loans are less likely to subscribe; simple and interpretable
education Categorical Higher education levels (tertiary) correlate with higher subscription odds
marital Categorical Some variation observed; potentially useful with clear reference level
job Categorical Certain job roles (retired, student) show increased subscription; use with level grouping

Prepare model

Will do the cyclical encoding for month and dummy variables (as they are factors GLM will dummy them)

candidate_data <- bank
candidate_data$month_num <- as.numeric(factor(candidate_data$month, levels = c(
  "jan", "feb", "mar", "apr", "may", "jun",
  "jul", "aug", "sep", "oct", "nov", "dec"
)))

candidate_data$month_sin <- sin(2 * pi * candidate_data$month_num / 12)
candidate_data$month_cos <- cos(2 * pi * candidate_data$month_num / 12)


candidate_data <- candidate_data |> dplyr::select(-month)
head(candidate_data)
split_rate <- 0.7
split <- sample(1:nrow(candidate_data), split_rate * nrow(candidate_data))
train_data <- candidate_data[split, ]
test_data  <- candidate_data[-split, ]

#num_feat <- c("duration", "poutcome", "pdays", "balance", "default", "housing", "campaign", "month_sin", "month_cos")
num_feat <- c("duration", "balance", "campaign", "month_sin", "month_cos")
cat_feat <- c("contact", "loan", "education", "marital", "job", "day", "previous", "poutcome", "housing")

features <- c(num_feat, cat_feat)
candidate_model <- as.formula(paste("subscribed ~", paste(features, collapse = " + ")))

GLM

threshold <- 0.25
candidate_fit <- glm(candidate_model, data = train_data, family = binomial)
glm_pred <- predict(candidate_fit, newdata = test_data, type = "response")

model_levels <- levels(candidate_data$subscribed)
pred_class <- factor(ifelse(glm_pred > threshold, "yes", "no"), levels = model_levels)
glm_actual <- factor(test_data$subscribed, levels = model_levels)

confusionMatrix(pred_class, glm_actual, positive = "yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    no   yes
##        no  11144   686
##        yes   812   922
##                                           
##                Accuracy : 0.8896          
##                  95% CI : (0.8842, 0.8948)
##     No Information Rate : 0.8815          
##     P-Value [Acc > NIR] : 0.001670        
##                                           
##                   Kappa : 0.4889          
##                                           
##  Mcnemar's Test P-Value : 0.001239        
##                                           
##             Sensitivity : 0.57338         
##             Specificity : 0.93208         
##          Pos Pred Value : 0.53172         
##          Neg Pred Value : 0.94201         
##              Prevalence : 0.11855         
##          Detection Rate : 0.06797         
##    Detection Prevalence : 0.12784         
##       Balanced Accuracy : 0.75273         
##                                           
##        'Positive' Class : yes             
## 

Tune thresholds

thresholds <- seq(0.1, 0.9, by = 0.01)

metrics_df <- purrr::map_dfr(thresholds, function(thresh) {
  pred_class <- factor(ifelse(glm_pred > thresh, "yes", "no"), levels = c("no", "yes"))
  
  tibble(
    threshold = thresh,
    precision = yardstick::precision_vec(truth = glm_actual, estimate = pred_class, event_level = "second"),
    recall = yardstick::recall_vec(truth = glm_actual, estimate = pred_class, event_level = "second"),
    f1 = yardstick::f_meas_vec(truth = glm_actual, estimate = pred_class, event_level = "second")
  )
})

ggplot(metrics_df, aes(x = threshold)) +
  geom_line(aes(y = f1), color = "blue") +
  geom_line(aes(y = precision), color = "green") +
  geom_line(aes(y = recall), color = "red") +
  labs(title = "Threshold Tuning", y = "Metric", x = "Threshold")

Misclassification spot-check

test_data_aug <- test_data %>%
  dplyr::mutate(
    pred_prob = glm_pred,
    pred_class = factor(ifelse(glm_pred > threshold, "yes", "no"), levels = c("no", "yes")),
    subscribed = factor(subscribed, levels = c("no", "yes")),
    result = case_when(
      subscribed == "yes" & pred_class == "yes" ~ "TP",
      subscribed == "no" & pred_class == "yes" ~ "FP",
      subscribed == "yes" & pred_class == "no" ~ "FN",
      subscribed == "no" & pred_class == "no" ~ "TN"
    )
  )

ggplot(test_data_aug, aes(x = duration, y = balance, color = result)) +
  geom_point(alpha = 0.4) +
  labs(title = "False Positives vs. True Positives",
       subtitle = paste("Threshold:", threshold),
       color = "Prediction Outcome")

test_data$subscribed <- factor(test_data$subscribed, levels = c("no", "yes"))

pr_df <- tibble(
  subscribed = test_data$subscribed,
  .pred_yes = glm_pred
)

pr_curve(pr_df, truth = subscribed, .pred_yes) %>%
  autoplot() +
  labs(
    title = "Precision-Recall Curve",
    subtitle = "Probability thresholds for predicting 'yes'"
  )
## Registered S3 method overwritten by 'parsnip':
##   method          from     
##   autoplot.glmnet ggfortify

Prediction - Objective 2

Quick inspection for potential complexity

bank_loess <- bank
bank_loess$subscribed <- ifelse(bank_loess$subscribed == "yes", 1, 0)
plt_numeric_vars <- setdiff(numeric_vars, c())

plots <- lapply(plt_numeric_vars, function(var) {
  ggplot(bank_loess, aes_string(x = var, y = "subscribed", colour = var)) +
    geom_point() +
    geom_smooth(method = "loess", se = FALSE, size = 1, span = 1.5) +
    ylim(-.2, 1.2) +
    labs(title = paste("Subscription vs", var), y = "Subscription Rate", x = var)
})
grid.arrange(grobs = plots, ncol = 3)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Variable Shape Interpretation Suggestion
age U-shape → increasing Older clients tend to subscribe more; age shows a non-linear effect Include age^2
balance Inverted U-shape Clients with mid-range balances are more likely to subscribe Use balance + balance^2
duration Logistic-like rise then plateau Longer calls are associated with higher subscription rates, with signs of saturation Consider log(duration + 1) or natural spline
campaign Shallow U-shape Additional contacts may reduce effectiveness after a point Keep as linear or bin into tiers
pdays Flat with mild positive curvature Values like -1 dominate; higher values show a weak increasing trend Cap at 300 or bin; optionally include pdays == -1 indicator
previous Curved rise then decline Moderate prior contact increases success; too many may reduce effectiveness Use previous + previous^2 or log(previous + 1)

Prepare Complexity

Look for some interactions

#'  By domain knowledge we explore couple interactions.
#'  - duration × poutcome: Duration and previous outcome can drive success
#'  - duration × contact: Communication channel can affect duration of calls
#'  - job × education: Job and education can have strong relationships
#'  - housing × loan: Debt-free can drive decisions regarding financial risks

inter_data <- candidate_data
inter_data$subscribed <- ifelse(inter_data$subscribed == "yes", 1, 0)

iner_job_data <- inter_data |>
  group_by(job, education) |>
  summarise(subscribe_rate = mean(subscribed), .groups = "drop")

housing_loan_data <- inter_data %>%
  group_by(housing, loan) %>%
  summarise(rate = mean(subscribed), .groups = "drop")


plt_outcome <- ggplot(inter_data, aes(y = subscribed, x = duration, colour = poutcome)) +
  geom_point() +
  labs(
    title = "Subscription Rate by Duration × Outcome",
  ) +
  geom_smooth(method = "loess", size = 1, span = .5) +
  ylim(-.2, 1)


plt_dur_ct <- ggplot(inter_data, aes(y = subscribed, x = duration, colour = contact)) +
  geom_point() +
  labs(
    title = "Subscription Rate by Duration × Contact",
  ) +
  geom_smooth(method = "loess", size = 1, span = .5) +
  ylim(-.2, 1)


plt_job <- ggplot(iner_job_data, aes(x = education, y = job, fill = subscribe_rate)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "steelblue", name = "Subscription Rate") +
  labs(
    title = "Subscription Rate by Job × Education",
    x = "Education Level", y = "Job"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


plt_house_loan <- ggplot(housing_loan_data, aes(x = housing, y = rate, fill = loan)) +
  geom_col(position = "dodge") +
  labs(
    title = "Subscription Rate by Housing × Loan",
    x = "Housing Loan", y = "Subscription Rate"
  )

(plt_outcome + plt_dur_ct) / (plt_job + plt_house_loan)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

#' Local Variables
#' -  Setting threshold for all models - might need adjustment for each; but the goal is to compare in a fair ground
#' -  Train Control for all models, same cross validation. Can be set individually for each model
#' -  Set Reference Levels

threshold <- 0.20
model_levels <- levels(candidate_data$subscribed)

fitControl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 1,
  classProbs = TRUE,
  summaryFunction = mnLogLoss
)

drop_num <- c("pdays")
drop_cat <- c("month", "subscribed")
no_transform <- c("month_sin", "month_cos", "day")

numeric_clean <- setdiff(numeric_vars, drop_num)
categorical_clean <- setdiff(categorical_vars, drop_cat)

existing_vars <- names(candidate_data)
direct_numeric <- intersect(setdiff(no_transform, drop_num), existing_vars)

# For GLM/LDA models
num_poly <- setdiff(numeric_clean, no_transform)
poly_terms <- paste0("poly(", num_poly, ", 2, raw = TRUE)")

interaction_terms <- c(
  "poly(duration, 2, raw = TRUE):poutcome",
  "poly(duration, 2, raw = TRUE):contact",
  "job:education",
  "housing:loan"
)

formula_terms_full <- c(poly_terms, direct_numeric, categorical_clean, interaction_terms)
candidate_formula_string <- paste("subscribed ~", paste(formula_terms_full, collapse = " + "))
candidate_model <- as.formula(candidate_formula_string)

# Random Forest version
formula_terms_rf <- intersect(c(numeric_clean, direct_numeric, categorical_clean), existing_vars)
rf_formula_string <- paste("subscribed ~", paste(formula_terms_rf, collapse = " + "))
rf_candidate_model <- as.formula(rf_formula_string)

candidate_model
## subscribed ~ poly(age, 2, raw = TRUE) + poly(balance, 2, raw = TRUE) + 
##     poly(duration, 2, raw = TRUE) + poly(campaign, 2, raw = TRUE) + 
##     poly(previous, 2, raw = TRUE) + month_sin + month_cos + day + 
##     job + marital + education + default + housing + loan + contact + 
##     poutcome + poly(duration, 2, raw = TRUE):poutcome + poly(duration, 
##     2, raw = TRUE):contact + job:education + housing:loan
rf_candidate_model
## subscribed ~ age + balance + day + duration + campaign + previous + 
##     month_sin + month_cos + job + marital + education + default + 
##     housing + loan + contact + poutcome

Feature Selection

step_model <- stepAIC(
  glm(candidate_model, data = train_data, family = binomial),
  scope = list(lower = ~ month_cos, upper = candidate_model),
  direction = "both"
)
## Start:  AIC=14940.59
## subscribed ~ poly(age, 2, raw = TRUE) + poly(balance, 2, raw = TRUE) + 
##     poly(duration, 2, raw = TRUE) + poly(campaign, 2, raw = TRUE) + 
##     poly(previous, 2, raw = TRUE) + month_sin + month_cos + day + 
##     job + marital + education + default + housing + loan + contact + 
##     poutcome + poly(duration, 2, raw = TRUE):poutcome + poly(duration, 
##     2, raw = TRUE):contact + job:education + housing:loan
## 
##                                          Df Deviance   AIC
## - job:education                          33    14817 14915
## - default                                 1    14777 14939
## - day                                     1    14778 14940
## <none>                                         14777 14941
## - poly(previous, 2, raw = TRUE)           2    14786 14946
## - poly(balance, 2, raw = TRUE)            2    14793 14953
## - housing:loan                            1    14794 14956
## - month_sin                               1    14795 14957
## - marital                                 2    14799 14959
## - poly(duration, 2, raw = TRUE):poutcome  6    14830 14982
## - poly(duration, 2, raw = TRUE):contact   4    14853 15009
## - poly(age, 2, raw = TRUE)                2    14857 15017
## - poly(campaign, 2, raw = TRUE)           2    14868 15028
## 
## Step:  AIC=14914.95
## subscribed ~ poly(age, 2, raw = TRUE) + poly(balance, 2, raw = TRUE) + 
##     poly(duration, 2, raw = TRUE) + poly(campaign, 2, raw = TRUE) + 
##     poly(previous, 2, raw = TRUE) + month_sin + month_cos + day + 
##     job + marital + education + default + housing + loan + contact + 
##     poutcome + poly(duration, 2, raw = TRUE):poutcome + poly(duration, 
##     2, raw = TRUE):contact + housing:loan
## 
##                                          Df Deviance   AIC
## - default                                 1    14818 14914
## - day                                     1    14818 14914
## <none>                                         14817 14915
## - poly(previous, 2, raw = TRUE)           2    14827 14921
## - poly(balance, 2, raw = TRUE)            2    14834 14928
## - housing:loan                            1    14834 14930
## - month_sin                               1    14836 14932
## - marital                                 2    14840 14934
## + job:education                          33    14777 14941
## - education                               3    14854 14946
## - poly(duration, 2, raw = TRUE):poutcome  6    14870 14956
## - job                                    11    14887 14963
## - poly(duration, 2, raw = TRUE):contact   4    14895 14985
## - poly(age, 2, raw = TRUE)                2    14904 14998
## - poly(campaign, 2, raw = TRUE)           2    14910 15004
## 
## Step:  AIC=14913.48
## subscribed ~ poly(age, 2, raw = TRUE) + poly(balance, 2, raw = TRUE) + 
##     poly(duration, 2, raw = TRUE) + poly(campaign, 2, raw = TRUE) + 
##     poly(previous, 2, raw = TRUE) + month_sin + month_cos + day + 
##     job + marital + education + housing + loan + contact + poutcome + 
##     poly(duration, 2, raw = TRUE):poutcome + poly(duration, 2, 
##     raw = TRUE):contact + housing:loan
## 
##                                          Df Deviance   AIC
## - day                                     1    14818 14912
## <none>                                         14818 14914
## + default                                 1    14817 14915
## - poly(previous, 2, raw = TRUE)           2    14827 14919
## - poly(balance, 2, raw = TRUE)            2    14835 14927
## - housing:loan                            1    14835 14929
## - month_sin                               1    14837 14931
## - marital                                 2    14840 14932
## + job:education                          33    14777 14939
## - education                               3    14854 14944
## - poly(duration, 2, raw = TRUE):poutcome  6    14871 14955
## - job                                    11    14888 14962
## - poly(duration, 2, raw = TRUE):contact   4    14895 14983
## - poly(age, 2, raw = TRUE)                2    14905 14997
## - poly(campaign, 2, raw = TRUE)           2    14910 15002
## 
## Step:  AIC=14912.42
## subscribed ~ poly(age, 2, raw = TRUE) + poly(balance, 2, raw = TRUE) + 
##     poly(duration, 2, raw = TRUE) + poly(campaign, 2, raw = TRUE) + 
##     poly(previous, 2, raw = TRUE) + month_sin + month_cos + job + 
##     marital + education + housing + loan + contact + poutcome + 
##     poly(duration, 2, raw = TRUE):poutcome + poly(duration, 2, 
##     raw = TRUE):contact + housing:loan
## 
##                                          Df Deviance   AIC
## <none>                                         14818 14912
## + day                                     1    14818 14914
## + default                                 1    14818 14914
## - poly(previous, 2, raw = TRUE)           2    14828 14918
## - poly(balance, 2, raw = TRUE)            2    14836 14926
## - housing:loan                            1    14836 14928
## - month_sin                               1    14838 14930
## - marital                                 2    14841 14931
## + job:education                          33    14778 14938
## - education                               3    14855 14943
## - poly(duration, 2, raw = TRUE):poutcome  6    14872 14954
## - job                                    11    14889 14961
## - poly(duration, 2, raw = TRUE):contact   4    14896 14982
## - poly(age, 2, raw = TRUE)                2    14906 14996
## - poly(campaign, 2, raw = TRUE)           2    14915 15005
selected_formula <- formula(step_model)
summary(step_model)
## 
## Call:
## glm(formula = subscribed ~ poly(age, 2, raw = TRUE) + poly(balance, 
##     2, raw = TRUE) + poly(duration, 2, raw = TRUE) + poly(campaign, 
##     2, raw = TRUE) + poly(previous, 2, raw = TRUE) + month_sin + 
##     month_cos + job + marital + education + housing + loan + 
##     contact + poutcome + poly(duration, 2, raw = TRUE):poutcome + 
##     poly(duration, 2, raw = TRUE):contact + housing:loan, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##                                                   Estimate Std. Error z value
## (Intercept)                                     -7.867e-01  3.369e-01  -2.335
## poly(age, 2, raw = TRUE)1                       -1.082e-01  1.220e-02  -8.871
## poly(age, 2, raw = TRUE)2                        1.240e-03  1.330e-04   9.326
## poly(balance, 2, raw = TRUE)1                    4.403e-05  1.035e-05   4.252
## poly(balance, 2, raw = TRUE)2                   -9.365e-10  3.193e-10  -2.933
## poly(duration, 2, raw = TRUE)1                   7.954e-03  5.747e-04  13.841
## poly(duration, 2, raw = TRUE)2                  -3.707e-06  4.450e-07  -8.332
## poly(campaign, 2, raw = TRUE)1                  -1.413e-01  1.653e-02  -8.550
## poly(campaign, 2, raw = TRUE)2                   2.799e-03  7.397e-04   3.783
## poly(previous, 2, raw = TRUE)1                   5.121e-02  2.153e-02   2.378
## poly(previous, 2, raw = TRUE)2                  -7.668e-04  9.125e-04  -0.840
## month_sin                                        1.578e-01  3.521e-02   4.481
## month_cos                                       -2.402e-02  3.471e-02  -0.692
## jobblue-collar                                  -3.628e-01  8.675e-02  -4.182
## jobentrepreneur                                 -4.827e-01  1.492e-01  -3.235
## jobhousemaid                                    -5.988e-01  1.652e-01  -3.626
## jobmanagement                                   -2.454e-01  8.851e-02  -2.772
## jobretired                                      -2.617e-01  1.292e-01  -2.025
## jobself-employed                                -4.900e-01  1.333e-01  -3.675
## jobservices                                     -2.212e-01  9.899e-02  -2.235
## jobstudent                                       5.294e-01  1.326e-01   3.994
## jobtechnician                                   -2.152e-01  8.252e-02  -2.607
## jobunemployed                                   -1.566e-01  1.318e-01  -1.188
## jobunknown                                      -4.246e-01  2.991e-01  -1.420
## maritalmarried                                  -2.280e-01  6.994e-02  -3.260
## maritalsingle                                   -4.111e-03  8.081e-02  -0.051
## educationsecondary                               1.179e-01  7.697e-02   1.532
## educationtertiary                                4.572e-01  8.982e-02   5.090
## educationunknown                                 2.002e-01  1.263e-01   1.586
## housingyes                                      -8.775e-01  5.249e-02 -16.718
## loanyes                                         -8.375e-01  1.035e-01  -8.095
## contacttelephone                                -2.713e-02  1.723e-01  -0.157
## contactunknown                                  -2.540e+00  2.166e-01 -11.726
## poutcomeother                                    3.879e-01  2.419e-01   1.604
## poutcomesuccess                                  2.254e+00  2.325e-01   9.694
## poutcomeunknown                                 -2.804e-01  1.622e-01  -1.729
## poly(duration, 2, raw = TRUE)1:poutcomeother    -1.270e-03  9.582e-04  -1.325
## poly(duration, 2, raw = TRUE)2:poutcomeother     9.226e-07  7.123e-07   1.295
## poly(duration, 2, raw = TRUE)1:poutcomesuccess  -3.033e-04  1.026e-03  -0.296
## poly(duration, 2, raw = TRUE)2:poutcomesuccess  -3.891e-07  7.841e-07  -0.496
## poly(duration, 2, raw = TRUE)1:poutcomeunknown  -6.056e-04  6.110e-04  -0.991
## poly(duration, 2, raw = TRUE)2:poutcomeunknown   1.252e-06  4.642e-07   2.697
## poly(duration, 2, raw = TRUE)1:contacttelephone -9.486e-04  5.419e-04  -1.751
## poly(duration, 2, raw = TRUE)2:contacttelephone  7.363e-07  2.875e-07   2.561
## poly(duration, 2, raw = TRUE)1:contactunknown    2.935e-03  5.368e-04   5.468
## poly(duration, 2, raw = TRUE)2:contactunknown   -8.659e-07  2.955e-07  -2.930
## housingyes:loanyes                               5.798e-01  1.395e-01   4.155
##                                                 Pr(>|z|)    
## (Intercept)                                     0.019553 *  
## poly(age, 2, raw = TRUE)1                        < 2e-16 ***
## poly(age, 2, raw = TRUE)2                        < 2e-16 ***
## poly(balance, 2, raw = TRUE)1                   2.12e-05 ***
## poly(balance, 2, raw = TRUE)2                   0.003355 ** 
## poly(duration, 2, raw = TRUE)1                   < 2e-16 ***
## poly(duration, 2, raw = TRUE)2                   < 2e-16 ***
## poly(campaign, 2, raw = TRUE)1                   < 2e-16 ***
## poly(campaign, 2, raw = TRUE)2                  0.000155 ***
## poly(previous, 2, raw = TRUE)1                  0.017389 *  
## poly(previous, 2, raw = TRUE)2                  0.400708    
## month_sin                                       7.42e-06 ***
## month_cos                                       0.488999    
## jobblue-collar                                  2.89e-05 ***
## jobentrepreneur                                 0.001217 ** 
## jobhousemaid                                    0.000288 ***
## jobmanagement                                   0.005569 ** 
## jobretired                                      0.042823 *  
## jobself-employed                                0.000238 ***
## jobservices                                     0.025414 *  
## jobstudent                                      6.50e-05 ***
## jobtechnician                                   0.009124 ** 
## jobunemployed                                   0.234843    
## jobunknown                                      0.155646    
## maritalmarried                                  0.001115 ** 
## maritalsingle                                   0.959427    
## educationsecondary                              0.125475    
## educationtertiary                               3.58e-07 ***
## educationunknown                                0.112843    
## housingyes                                       < 2e-16 ***
## loanyes                                         5.75e-16 ***
## contacttelephone                                0.874872    
## contactunknown                                   < 2e-16 ***
## poutcomeother                                   0.108759    
## poutcomesuccess                                  < 2e-16 ***
## poutcomeunknown                                 0.083871 .  
## poly(duration, 2, raw = TRUE)1:poutcomeother    0.185179    
## poly(duration, 2, raw = TRUE)2:poutcomeother    0.195204    
## poly(duration, 2, raw = TRUE)1:poutcomesuccess  0.767533    
## poly(duration, 2, raw = TRUE)2:poutcomesuccess  0.619745    
## poly(duration, 2, raw = TRUE)1:poutcomeunknown  0.321606    
## poly(duration, 2, raw = TRUE)2:poutcomeunknown  0.006998 ** 
## poly(duration, 2, raw = TRUE)1:contacttelephone 0.080004 .  
## poly(duration, 2, raw = TRUE)2:contacttelephone 0.010436 *  
## poly(duration, 2, raw = TRUE)1:contactunknown   4.56e-08 ***
## poly(duration, 2, raw = TRUE)2:contactunknown   0.003390 ** 
## housingyes:loanyes                              3.25e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22755  on 31645  degrees of freedom
## Residual deviance: 14818  on 31599  degrees of freedom
## AIC: 14912
## 
## Number of Fisher Scoring iterations: 7

Helper Functions

#' Simple Functions to avoid repeating a lot
#' Compute RMSE, LOGLOSS, and a Readeable Table for metrics
threshold <- 0.25

actual_preds <- factor(test_data$subscribed, levels = model_levels)

custom.rmse <- function(predicted, actual) {
  sqrt(mean((predicted - ifelse(actual == "yes", 1, 0))^2))
}

custom.logloss <- function(probs, actual, eps = 1e-15) {
  probs <- pmin(pmax(probs, eps), 1 - eps)
  actual_binary <- ifelse(actual == "yes", 1, 0)
  -mean(actual_binary * log(probs) + (1 - actual_binary) * log(1 - probs))
}

custom.results <- function(model_fit) {
  probs <- predict(model_fit, newdata = test_data, type = "prob")[, "yes"]
  classification <- factor(ifelse(probs > threshold, "yes", "no"), levels = model_levels)
  cm <- confusionMatrix(data = classification, reference = actual_preds, positive = "yes")
  
  list(
    Probabilities = probs,
    #Classification = classification,
    ConfusionMatrix = cm
  )
}

custom.metrics <- function(cm, fit, roc, probs) {
  sensitivity <- cm$byClass["Sensitivity"]
  ppv <- cm$byClass["Pos Pred Value"]
  f1_score <- if ((sensitivity + ppv) > 0) {
    2 * ((sensitivity * ppv) / (sensitivity + ppv))
  } else {
    NA
  }

  log_loss <- custom.logloss(probs, actual_preds)
  rmse <- custom.rmse(probs, actual_preds)

  aic <- NA
  if (!is.null(fit$finalModel) && "aic" %in% names(fit$finalModel)) {
    aic <- fit$finalModel$aic
  } else if (inherits(fit$finalModel, "glm")) {
    aic <- AIC(fit$finalModel)
  }

  data.frame(
    RMSE        = rmse,
    AIC         = aic,
    Accuracy    = cm$overall["Accuracy"],
    Sensitivity = sensitivity,
    Specificity = cm$byClass["Specificity"],
    PPV         = ppv,
    NPV         = cm$byClass["Neg Pred Value"],
    Prevalence  = cm$byClass["Prevalence"],
    AUROC       = auc(roc),
    LogLoss     = log_loss,
    F1          = f1_score
  )
}

Complex Logistic Regresion (GLM)

set.seed(42)
glm_fit <- train(
  candidate_model,
  data = train_data,
  method = "glm",
  family = "binomial",
  trControl = fitControl,
  metric = "logLoss"
)

glm_fit_selected <- train(
  selected_formula,
  data = train_data,
  method = "glm",
  family = "binomial",
  trControl = fitControl,
  metric = "logLoss"
)

glm_results <- custom.results(glm_fit)
glm_results_selected <- custom.results(glm_fit_selected)

LDA Model

set.seed(42)
lda_fit <- train(
  candidate_model,
  data = train_data,
  method = "lda",
  trControl = fitControl,
  metric = "logLoss"
)

lda_fit_selected <- train(
  selected_formula,
  data = train_data,
  method = "lda",
  trControl = fitControl,
  metric = "logLoss"
)

lda_results <- custom.results(lda_fit)
lda_results_selected <- custom.results(lda_fit_selected)

KNN

knn_fit <- train(
  rf_candidate_model,
  data = train_data,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = fitControl,
  tuneLength = 2
)

knn_results <- custom.results(knn_fit)

Random Forest

set.seed(42)
rf_fit <- train(
  rf_candidate_model,
  data = train_data,
  method = "ranger",
  trControl = fitControl,
  metric = "logLoss",
  tuneLength = 1
)

rf_results <- custom.results(rf_fit)

Performance Metrics

roc_lda <- roc(actual_preds, lda_results$Probabilities, levels = model_levels, direction = "<")
roc_glm <- roc(actual_preds, glm_results$Probabilities, levels = model_levels, direction = "<")
roc_rf <- roc(actual_preds, rf_results$Probabilities, levels = model_levels, direction = "<")
roc_knn <- roc(actual_preds, knn_results$Probabilities, levels = model_levels, direction = "<")

#' Hybrid Selected ROC
plot(roc_lda, col = base_palette[1], lwd = 2, main = "ROC Curve: LDA vs Logistic vs Random Forest")
lines(roc_glm, col = base_palette[2], lwd = 2)
lines(roc_rf, col = base_palette[3], lwd = 2)
lines(roc_knn, col = base_palette[4], lwd = 2)

legend("bottomright",
  legend = c(
    paste("LDA           (AUC =", round(auc(roc_lda), 3), ")"),
    paste("Logistic      (AUC =", round(auc(roc_glm), 3), ")"),
    paste("Random Forest (AUC =", round(auc(roc_rf), 3), ")"),
    paste("KNN           (AUC =", round(auc(roc_knn), 3), ")")
  ),
  col = base_palette[1:4],
  lwd = 4
)

Feature Selected ROC

#' Feature Selected ROC
roc_lda_selected <- roc(actual_preds, lda_results_selected$Probabilities, levels = model_levels, direction = "<")
roc_glm_selected <- roc(actual_preds, glm_results_selected$Probabilities, levels = model_levels, direction = "<")

#' Feature Selected ROC
plot(roc_lda_selected, col = base_palette[7], lwd = 2, main = "ROC Curve: LDA vs Logistic - (Feature Selected)")
lines(roc_glm_selected, col = base_palette[8], lwd = 2)

legend("bottomright",
  legend = c(
    paste("LDA           (AUC =", round(auc(roc_lda_selected), 3), ")"),
    paste("Logistic      (AUC =", round(auc(roc_glm_selected), 3), ")")
  ),
  col = base_palette[7:8],
  lwd = 4
)

Metrics Table Comparision

results_table <- rbind(
  LDA = custom.metrics(lda_results$ConfusionMatrix, lda_fit, roc_lda, lda_results$Probabilities),
  LDA.FeatureSelected = custom.metrics(lda_results_selected$ConfusionMatrix, lda_fit_selected, roc_lda_selected, lda_results_selected$Probabilities),
  Logistic = custom.metrics(glm_results$ConfusionMatrix, glm_fit, roc_glm, glm_results$Probabilities),
  Logistic.FeatureSelected = custom.metrics(glm_results_selected$ConfusionMatrix, glm_fit_selected, roc_glm_selected, glm_results_selected$Probabilities),
  RandomForest = custom.metrics(rf_results$ConfusionMatrix, rf_fit, roc_rf, rf_results$Probabilities),
  kNN = custom.metrics(knn_results$ConfusionMatrix, knn_fit, roc_knn, knn_results$Probabilities)
)
round(data.frame(t(results_table)), 4)